{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 垃圾邮件分类\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "> Anyone knows how much it costs to host a web portal ?\n",
      ">\n",
      "Well, it depends on how many visitors you're expecting.\n",
      "This can be anywhere from less than 10 bucks a month to a couple of $100. \n",
      "You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \n",
      "if youre running something big..\n",
      "\n",
      "To unsubscribe yourself from this mailing list, send an email to:\n",
      "groupname-unsubscribe@egroups.com\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "with open('data/emailSample1.txt', 'r') as f:\n",
    "    sampe_email = f.read()\n",
    "    print(sampe_email)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. 对邮件进行预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "预处理主要包括以下8个部分：\n",
    "  1. 将大小写统一成小写字母；\n",
    "  2. 移除所有HTML标签，只保留内容。\n",
    "  3. 将所有的网址替换为字符串 “httpaddr”.\n",
    "  4. 将所有的邮箱地址替换为 “emailaddr”\n",
    "  5. 将所有dollar符号($)替换为“dollar”.\n",
    "  6. 将所有数字替换为“number”\n",
    "  7. 将所有单词还原为词源，词干提取\n",
    "  8. 移除所有非文字类型\n",
    "  9.去除空字符串‘’\n",
    "'''\n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.io import loadmat\n",
    "from sklearn import svm\n",
    "import nltk.stem as ns\n",
    "import re\n",
    "\n",
    "def preprocessing(email):\n",
    "    \n",
    "    # 1. 统一成小写\n",
    "    email = email.lower()\n",
    "    \n",
    "    #2. 去除html标签\n",
    "    email = re.sub('<[^<>]>', ' ', email)\n",
    "    \n",
    "    #3. 将网址替换为字符串 “httpaddr”.\n",
    "    email = re.sub('(http|https)://[^\\s]*', 'httpaddr', email ) \n",
    "    \n",
    "    #4. 将邮箱地址替换为 “emailaddr”\n",
    "    email = re.sub('[^\\s]+@[^\\s]+', 'emailaddr', email)\n",
    "    \n",
    "     # 5.所有dollar符号($)替换为“dollar”.\n",
    "    email = re.sub('[\\$]+', 'dollar', email) \n",
    "    \n",
    "    # 6.匹配数字，将数字替换为“number”\n",
    "    email = re.sub('[0-9]+', 'number', email) # 匹配一个数字， 相当于 [0-9]，+ 匹配1到多次\n",
    "    \n",
    "    # 7. 词干提取\n",
    "    tokens = re.split('[ \\@\\$\\/\\#\\.\\-\\:\\&\\*\\+\\=\\[\\]\\?\\!\\(\\)\\{\\}\\,\\'\\\"\\>\\_\\<\\;\\%]', email)\n",
    "    tokenlist=[]\n",
    "\n",
    "    s = ns.SnowballStemmer('english')\n",
    "        \n",
    "    for token in tokens:\n",
    "        \n",
    "        # 8. 移除非文字类型\n",
    "        email  = re.sub('[^a-zA-Z0-9]', '', email)\n",
    "        stemmed = s.stem(token)\n",
    "    \n",
    "        # 9.去除空字符串‘’\n",
    "        if not len(token): continue\n",
    "        tokenlist.append(stemmed)  \n",
    "        \n",
    "    return tokenlist\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "email = preprocessing(sampe_email)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def email2VocabIndices(email, vocab):\n",
    "    \"\"\"提取存在单词的索引\"\"\"\n",
    "    token = preprocessing(email)\n",
    "    print(token)\n",
    "    index = [i for i in range(len(token)) if token[i] in vocab]\n",
    "    return index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def email2FeatureVector(email):\n",
    "    \"\"\"\n",
    "    将email转化为词向量，n是vocab的长度。存在单词的相应位置的值置为1，其余为0\n",
    "    \"\"\"\n",
    "    df = pd.read_table('data/vocab.txt',names=['words'])\n",
    "    vocab = df.values  # return array\n",
    "    vector = np.zeros(len(vocab))  # init vector\n",
    "    vocab_indices = email2VocabIndices(email, vocab) \n",
    "    print(vocab_indices)# 返回含有单词的索引\n",
    "    # 将有单词的索引置为1\n",
    "    for i in vocab_indices:\n",
    "        vector[i] = 1\n",
    "    return vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['anyon', 'know', 'how', 'much', 'it', 'cost', 'to', 'host', 'a', 'web', 'portal', '\\n', '\\nwell', 'it', 'depend', 'on', 'how', 'mani', 'visitor', 'you', 're', 'expect', '\\nthis', 'can', 'be', 'anywher', 'from', 'less', 'than', 'number', 'buck', 'a', 'month', 'to', 'a', 'coupl', 'of', 'dollarnumb', '\\nyou', 'should', 'checkout', 'httpaddr', 'or', 'perhap', 'amazon', 'ecnumb', '\\nif', 'your', 'run', 'someth', 'big', '\\n\\nto', 'unsubscrib', 'yourself', 'from', 'this', 'mail', 'list', 'send', 'an', 'email', 'to', '\\nemailaddr\\n\\n']\n",
      "[0, 1, 2, 3, 4, 5, 6, 7, 9, 13, 14, 15, 16, 17, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 32, 33, 35, 36, 37, 39, 41, 42, 43, 47, 48, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 61]\n",
      "length of vector = 1899\n",
      "num of non-zero = 46\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "vector = email2FeatureVector(sampe_email)\n",
    "print('length of vector = {}\\nnum of non-zero = {}'.format(len(vector), int(vector.sum())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1899,)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vector.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
